Front matter
This submission is my work alone and complies with the 30531 integrity policy.
Add your initials to indicate your agreement: CS, YC, DF
# LOAD LIBRARIES HERE
library(tidyverse)      # dplyr, ggplot2, readr, stringr, tidyr, ...
library(nycflights13)   # flights / airports / planes / weather tables
library(maps)           # state outlines for ggplot2::borders("state")
library(testthat)       # inline unit tests
# Preview the flights table (one row per 2013 NYC departure).
flights
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Preview the airports table: one row per airport, keyed by FAA code,
# with latitude/longitude used for mapping below.
airports
## # A tibble: 1,458 x 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <int> <dbl> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/New_…
## 2 06A Moton Field Municip… 32.5 -85.7 264 -6 A America/Chic…
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/Chic…
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/New_…
## 5 09J Jekyll Island Airpo… 31.1 -81.4 11 -5 A America/New_…
## 6 0A9 Elizabethton Munici… 36.4 -82.2 1593 -5 A America/New_…
## 7 0G6 Williams County Air… 41.5 -84.5 730 -5 A America/New_…
## 8 0G7 Finger Lakes Region… 42.9 -76.8 492 -5 A America/New_…
## 9 0P2 Shoestring Aviation… 39.8 -76.6 1000 -5 U America/New_…
## 10 0S9 Jefferson County In… 48.1 -123. 108 -8 A America/Los_…
## # ... with 1,448 more rows
Answers: To draw the route each plane flies from its origin to its destination, I would use the “origin” and “dest” variables in the “flights” table to identify where each plane departs and lands, and the “lat” and “lon” variables in the “airports” table to locate those airports on a map. I would then join “flights” to “airports” twice — once on the origin code and once on the destination code — so that each flight record carries the coordinates of both endpoints.
# Hand-drawn join diagram for this question, stored as an image file.
knitr::include_graphics("/project2/ppha30531/caiy/ps6.JPG", dpi = NA)
Answers: The “faa” variable in the “airports” table is matched with the “origin” variable in the “weather” table.
# Mean arrival delay for each destination airport.
dest_avg_delay <-
  flights %>%
  group_by(dest) %>%
  summarise(avg_delay = mean(arr_delay, na.rm = TRUE))

# Keep only destinations present in the airports table, picking up
# their coordinates in the process.
average_delay_by_destination <-
  inner_join(airports, dest_avg_delay, by = c("faa" = "dest"))

# US map of destinations; point size encodes the average delay.
average_delay_by_destination %>%
  ggplot(aes(lon, lat, size = avg_delay)) +
  borders("state") +
  geom_point(colour = "pink") +
  coord_quickmap()
## Warning: Removed 1 rows containing missing values (geom_point).
# Look-up table of airport coordinates, keyed by FAA code.
airport_coords <-
  airports %>%
  select(faa, lat, lon)

# BUG FIX: the original joined only latitude for the origin airport and
# only longitude for the destination airport, mixing coordinates of two
# different airports into one (lat, lon) pair. Join BOTH lat and lon
# for each endpoint instead, with suffixes to tell them apart.
flights %>%
  left_join(airport_coords, by = c("origin" = "faa")) %>%
  left_join(airport_coords,
            by = c("dest" = "faa"),
            suffix = c("_origin", "_dest"))
## # A tibble: 336,776 x 21
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 14 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, lat <dbl>, lon <dbl>
# Age of each airframe in 2013 (planes$year is the manufacture year).
# FIX: the original overwrote the nycflights13 `planes` dataset binding
# with a mutated copy; build the lookup table directly instead so
# `planes` stays untouched for later chunks.
planes_tailnum_age <-
  planes %>%
  mutate(age_of_planes = 2013 - year) %>%
  select(tailnum, age_of_planes)

# Average departure delay by plane age, plotted as a scatter + line.
flights %>%
  inner_join(planes_tailnum_age, by = "tailnum") %>%
  group_by(age_of_planes) %>%
  summarise(avg_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(aes(x = age_of_planes, y = avg_delay)) +
  geom_point(colour = "pink") +
  geom_line() +
  labs(title = "Relationship Between Planes Age and Delays")
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_path).
Answers: According to the graph above, I do not notice a clear trend between plane age and delay. The relationship appears to be essentially random.
# Hourly precipitation per origin airport, ready to join onto flights.
weather_join <-
  weather %>%
  select(origin, year, month, day, hour, precip)

# Average departure delay by precipitation level. When both tables
# share the key names, listing each name once is enough; the original's
# "origin" = "origin", "year" = "year", ... spelling was redundant.
flights %>%
  inner_join(weather_join,
             by = c("origin", "year", "month", "day", "hour")) %>%
  group_by(precip) %>%
  summarise(avg_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(aes(x = precip, y = avg_delay)) +
  geom_point(colour = "pink") +
  geom_line() +
  labs(title = "Relationship Between Weather Conditions and Delays")
Answers: According to the graph above, I do not notice a clear trend between precipitation and delay. The relationship appears to be essentially random.
# Average arrival delay by destination for 2013-06-13 only.
delays_june13 <-
  flights %>%
  filter(year == 2013, month == 6, day == 13) %>%
  group_by(dest) %>%
  summarise(avg_delay = mean(arr_delay, na.rm = TRUE))

# Attach airport coordinates so the delays can be mapped.
June_13_delay <-
  inner_join(airports, delays_june13, by = c("faa" = "dest"))

# Both point size and colour encode the average delay.
June_13_delay %>%
  ggplot(aes(lon, lat, size = avg_delay, colour = avg_delay)) +
  borders("state") +
  geom_point() +
  coord_quickmap()
## Warning: Removed 3 rows containing missing values (geom_point).
Answers: It seems that high average delays were concentrated in the eastern US, consistent with heavy precipitation there. This checks out: according to https://en.wikipedia.org/wiki/June_12%E2%80%9313,_2013_derecho_series, two derechos occurred across different areas of the eastern United States on those dates, causing major wind damage across North Carolina, Virginia, Maryland, and other states.
Answers: It seems that many carriers do not report tail numbers, which causes so many missing tailnum.
# Flights whose tail number never appears in the planes table.
flight_tailnum <-
  anti_join(flights, planes, by = "tailnum")

# Number of such unmatched flights per carrier.
flight_tailnum %>%
  count(carrier)
## # A tibble: 10 x 2
## carrier n
## <chr> <int>
## 1 9E 1044
## 2 AA 22558
## 3 B6 830
## 4 DL 110
## 5 F9 50
## 6 FL 187
## 7 MQ 25397
## 8 UA 1693
## 9 US 699
## 10 WN 38
Answers: It seems that many carriers have missing values for tailnum, meaning that they probably did not report tail numbers. Of those carrier who do not report, MQ and AA have the majority of the missing values.
Answers: Those two argument orders give different results. The first returns flights whose destination airport does not appear in the “airports” table; the second returns airports that are never the destination of any flight in the data — basically, airports that receive no flights from New York in this dataset.
–anti_join(flights, airports, by = c(“dest” = “faa”)) returns all rows from the “flights” dataset that have no matching values in the “airports” dataset, matching the key “dest” in “flights” to “faa” in “airports” and keeping just the columns from the “flights” dataset. –anti_join(airports, flights, by = c(“faa” = “dest”)) returns all rows from the “airports” dataset that have no matching values in the “flights” dataset, matching the key “faa” in “airports” to “dest” in “flights” and keeping just the columns from the “airports” dataset.
# One row per (tail number, carrier) pair, ignoring missing tail numbers.
flights %>%
  drop_na(tailnum) %>%
  count(tailnum, carrier)
## # A tibble: 4,060 x 3
## tailnum carrier n
## <chr> <chr> <int>
## 1 D942DN DL 4
## 2 N0EGMQ MQ 371
## 3 N10156 EV 153
## 4 N102UW US 48
## 5 N103US US 46
## 6 N104UW US 47
## 7 N10575 EV 289
## 8 N105UW US 45
## 9 N107US US 41
## 10 N108UW US 60
## # ... with 4,050 more rows
# One row per distinct tail number, ignoring missing tail numbers.
flights %>%
  drop_na(tailnum) %>%
  count(tailnum)
## # A tibble: 4,043 x 2
## tailnum n
## <chr> <int>
## 1 D942DN 4
## 2 N0EGMQ 371
## 3 N10156 153
## 4 N102UW 48
## 5 N103US 46
## 6 N104UW 47
## 7 N10575 289
## 8 N105UW 45
## 9 N107US 41
## 10 N108UW 60
## # ... with 4,033 more rows
Answers: We can see that the first result gives us 4,060 rows of data whereas the second result gives us 4,043 rows, meaning that there are 4,060 − 4,043 = 17 tail numbers that are associated with more than one airline.
# A deliberately coarse join: the first 100 flights matched to weather
# on year alone, so every weather row pairs with every sampled flight.
flight_100_rows <-
  flights %>%
  slice_head(n = 100)

weather %>%
  left_join(flight_100_rows, by = "year")
## # A tibble: 2,611,500 x 33
## origin.x year month.x day.x hour.x temp dewp humid wind_dir
## <chr> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270
## 2 EWR 2013 1 1 1 39.0 26.1 59.4 270
## 3 EWR 2013 1 1 1 39.0 26.1 59.4 270
## 4 EWR 2013 1 1 1 39.0 26.1 59.4 270
## 5 EWR 2013 1 1 1 39.0 26.1 59.4 270
## 6 EWR 2013 1 1 1 39.0 26.1 59.4 270
## 7 EWR 2013 1 1 1 39.0 26.1 59.4 270
## 8 EWR 2013 1 1 1 39.0 26.1 59.4 270
## 9 EWR 2013 1 1 1 39.0 26.1 59.4 270
## 10 EWR 2013 1 1 1 39.0 26.1 59.4 270
## # ... with 2,611,490 more rows, and 24 more variables: wind_speed <dbl>,
## # wind_gust <dbl>, precip <dbl>, pressure <dbl>, visib <dbl>,
## # time_hour.x <dttm>, month.y <int>, day.y <int>, dep_time <int>,
## # sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin.y <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour.y <dbl>, minute <dbl>, time_hour.y <dttm>
Answers: There are 2,611,500 rows there in the new dataset, and it takes about 2 seconds, which is comparatively long, for R to do this join.
13. Question: Describe the result of (but do not attempt to run!) using left_join to merge flights and weather based on year. Without actually running the code, but using what you know about the datasets and your answer to the previous question, how many rows will there be and how long will it take to run this code?
Answers: It will take a very long time for R to do this join. The resulting dataset would contain 26,115 (the number of rows in “weather”) × 336,776 (the number of rows in “flights”) rows, which is huge.
# First 100,000 rows of the parking-ticket file; readr guesses the
# column types from an initial sample of rows.
first100000 <-
  read_csv("/project2/ppha30531/data/ticket_data/processed/parking_tickets.csv",
           n_max = 100000,
           trim_ws = TRUE)
## Parsed with column specification:
## cols(
## .default = col_character(),
## ticket_number = col_integer(),
## issue_date = col_datetime(format = ""),
## unit = col_integer(),
## fine_level1_amount = col_integer(),
## fine_level2_amount = col_integer(),
## current_amount_due = col_double(),
## total_payments = col_double(),
## ticket_queue_date = col_datetime(format = ""),
## notice_number = col_double()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 25646 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 4739 ticket_nu… an integ… 9058163… '/project2/ppha30531/data/ticket_da… file 2 4771 ticket_nu… an integ… 9058163… '/project2/ppha30531/data/ticket_da… row 3 4779 ticket_nu… an integ… 9058163… '/project2/ppha30531/data/ticket_da… col 4 4795 ticket_nu… an integ… 9058163… '/project2/ppha30531/data/ticket_da… expected 5 4805 ticket_nu… an integ… 9058163… '/project2/ppha30531/data/ticket_da…
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
# readr attaches the parse problems to the tibble it returns, so inspect
# the object read above instead of re-reading 100,000 rows from disk.
(problem <- problems(first100000))
## Parsed with column specification:
## cols(
## .default = col_character(),
## ticket_number = col_integer(),
## issue_date = col_datetime(format = ""),
## unit = col_integer(),
## fine_level1_amount = col_integer(),
## fine_level2_amount = col_integer(),
## current_amount_due = col_double(),
## total_payments = col_double(),
## ticket_queue_date = col_datetime(format = ""),
## notice_number = col_double()
## )
## See spec(...) for full column specifications.
## # A tibble: 25,646 x 5
## row col expected actual file
## <int> <chr> <chr> <chr> <chr>
## 1 4739 ticket_n… an integ… 9058163… '/project2/ppha30531/data/ticket_da…
## 2 4771 ticket_n… an integ… 9058163… '/project2/ppha30531/data/ticket_da…
## 3 4779 ticket_n… an integ… 9058163… '/project2/ppha30531/data/ticket_da…
## 4 4795 ticket_n… an integ… 9058163… '/project2/ppha30531/data/ticket_da…
## 5 4805 ticket_n… an integ… 9058163… '/project2/ppha30531/data/ticket_da…
## 6 4864 ticket_n… an integ… 9058163… '/project2/ppha30531/data/ticket_da…
## 7 4868 ticket_n… an integ… 9058163… '/project2/ppha30531/data/ticket_da…
## 8 4896 ticket_n… an integ… 9058163… '/project2/ppha30531/data/ticket_da…
## 9 4937 ticket_n… an integ… 9058132… '/project2/ppha30531/data/ticket_da…
## 10 5007 ticket_n… an integ… 9058132… '/project2/ppha30531/data/ticket_da…
## # ... with 25,636 more rows
# Re-read with ticket_number forced to double: values such as
# 9,058,163,xxx (see the problems listing above) are too large for the
# 32-bit integer type readr guessed from the first rows.
first100000 <-
  read_csv(
    "/project2/ppha30531/data/ticket_data/processed/parking_tickets.csv",
    n_max = 100000,
    trim_ws = TRUE,
    col_types = cols(ticket_number = col_double())
  )
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 19 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 15862 unit an integer NULL '/project2/ppha30531/data/ticket_data/pro… file 2 16440 unit an integer NULL '/project2/ppha30531/data/ticket_data/pro… row 3 67437 unit an integer NULL '/project2/ppha30531/data/ticket_data/pro… col 4 67453 unit an integer NULL '/project2/ppha30531/data/ticket_data/pro… expected 5 67577 unit an integer NULL '/project2/ppha30531/data/ticket_data/pro…
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
# Same as above: the remaining parse problems (unit = "NULL") travel
# with the tibble, so there is no need to re-read the file to list them.
(problem2 <- problems(first100000))
## # A tibble: 19 x 5
## row col expected actual file
## <int> <chr> <chr> <chr> <chr>
## 1 15862 unit an integer NULL '/project2/ppha30531/data/ticket_data/pr…
## 2 16440 unit an integer NULL '/project2/ppha30531/data/ticket_data/pr…
## 3 67437 unit an integer NULL '/project2/ppha30531/data/ticket_data/pr…
## 4 67453 unit an integer NULL '/project2/ppha30531/data/ticket_data/pr…
## 5 67577 unit an integer NULL '/project2/ppha30531/data/ticket_data/pr…
## 6 67973 unit an integer NULL '/project2/ppha30531/data/ticket_data/pr…
## 7 68429 unit an integer NULL '/project2/ppha30531/data/ticket_data/pr…
## 8 69091 unit an integer NULL '/project2/ppha30531/data/ticket_data/pr…
## 9 76896 unit an integer NULL '/project2/ppha30531/data/ticket_data/pr…
## 10 76921 unit an integer NULL '/project2/ppha30531/data/ticket_data/pr…
## 11 77020 unit an integer NULL '/project2/ppha30531/data/ticket_data/pr…
## 12 78054 unit an integer NULL '/project2/ppha30531/data/ticket_data/pr…
## 13 79290 unit an integer NULL '/project2/ppha30531/data/ticket_data/pr…
## 14 79615 unit an integer NULL '/project2/ppha30531/data/ticket_data/pr…
## 15 79701 unit an integer NULL '/project2/ppha30531/data/ticket_data/pr…
## 16 88087 unit an integer NULL '/project2/ppha30531/data/ticket_data/pr…
## 17 88857 unit an integer NULL '/project2/ppha30531/data/ticket_data/pr…
## 18 89302 unit an integer NULL '/project2/ppha30531/data/ticket_data/pr…
## 19 89650 unit an integer NULL '/project2/ppha30531/data/ticket_data/pr…
#read in the file and record time
# Wall-clock timing of the full read + filter; reported as t_taken below.
t_start <- Sys.time()
# Systematic ~1% sample: keep tickets whose number is 1 (mod 100).
# ticket_number must be read as double (values overflow 32-bit integers).
tickets_1pct <-
read_csv("/project2/ppha30531/data/ticket_data/processed/parking_tickets.csv",
col_types = cols(ticket_number = col_double())
) %>%
filter(ticket_number %% 100 == 1)
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 1829 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 15862 unit an integer NULL '/project2/ppha30531/data/ticket_data/pro… file 2 16440 unit an integer NULL '/project2/ppha30531/data/ticket_data/pro… row 3 67437 unit an integer NULL '/project2/ppha30531/data/ticket_data/pro… col 4 67453 unit an integer NULL '/project2/ppha30531/data/ticket_data/pro… expected 5 67577 unit an integer NULL '/project2/ppha30531/data/ticket_data/pro…
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
t_end <- Sys.time()
# Elapsed wall-clock time for the read + filter above.
t_taken <- t_end - t_start
t_taken
## Time difference of 3.253941 mins
#https://stackoverflow.com/questions/6262203/measuring-function-execution-time-in-r
#test the number of rows
# Verify the sample size before caching it to disk.
test_that("we have the right number of rows", expect_equal(nrow(tickets_1pct), 287458))
# row.names = FALSE keeps write.csv from adding a spurious row-name
# column (it would come back as an unnamed "X" column on re-read).
write.csv(tickets_1pct, file = "/project2/ppha30531/caiy/tickets_1pct.csv",
          row.names = FALSE)
print(tickets_1pct)
## # A tibble: 287,458 x 23
## ticket_number issue_date violation_locat… license_plate_n…
## <dbl> <dttm> <chr> <chr>
## 1 51482901 2007-01-01 01:25:00 5762 N AVONDALE d41ee9a4cb0676e…
## 2 50681501 2007-01-01 01:51:00 2724 W FARRAGUT 3395fd3f71f18f9…
## 3 51579701 2007-01-01 02:22:00 1748 W ESTES 302cb9c55f63ff8…
## 4 51262201 2007-01-01 02:35:00 4756 N SHERIDAN 94d018f52c7990c…
## 5 51898001 2007-01-01 03:50:00 7134 S CAMPBELL 876dd3a95179f4f…
## 6 50681401 2007-01-01 04:10:00 2227 W FOSTERT 5fb25a5bb6bbd31…
## 7 51226001 2007-01-01 04:36:00 1411 S KOSTNER 603e09c12c607a2…
## 8 51376701 2007-01-01 05:40:00 6954 S ASHLAND a0c68892489c2f0…
## 9 51262301 2007-01-01 06:00:00 2630 N CANNON DR bdc8d83699eac18…
## 10 51226201 2007-01-01 08:35:00 4401 W 28TH STR… f65a21b9250fc93…
## # ... with 287,448 more rows, and 19 more variables:
## # license_plate_state <chr>, license_plate_type <chr>, zipcode <chr>,
## # violation_code <chr>, violation_description <chr>, unit <int>,
## # unit_description <chr>, vehicle_make <chr>, fine_level1_amount <int>,
## # fine_level2_amount <int>, current_amount_due <dbl>,
## # total_payments <dbl>, ticket_queue <chr>, ticket_queue_date <dttm>,
## # notice_level <chr>, hearing_disposition <chr>, notice_number <dbl>,
## # officer <chr>, address <chr>
# Count missing values in every column of the 1% sample.
colSums(is.na(tickets_1pct))
## ticket_number issue_date violation_location
## 0 0 0
## license_plate_number license_plate_state license_plate_type
## 0 97 2054
## zipcode violation_code violation_description
## 54115 0 0
## unit unit_description vehicle_make
## 29 0 0
## fine_level1_amount fine_level2_amount current_amount_due
## 0 0 0
## total_payments ticket_queue ticket_queue_date
## 0 0 0
## notice_level hearing_disposition notice_number
## 84068 259899 0
## officer address
## 0 0
# Tickets per issue year; the year is the first four characters of the
# issue_date timestamp. count() replaces the original's dangling
# group_by() plus a summarise() call that was left outside the pipe.
tickets_1pct_by_year <-
  tickets_1pct %>%
  mutate(year = str_sub(issue_date, 1, 4)) %>%
  count(year)
tickets_1pct_by_year
## # A tibble: 12 x 2
## year n
## <chr> <int>
## 1 2007 29252
## 2 2008 28713
## 3 2009 27335
## 4 2010 25324
## 5 2011 24898
## 6 2012 24406
## 7 2013 25983
## 8 2014 24552
## 9 2015 24227
## 10 2016 22643
## 11 2017 22364
## 12 2018 7761
# Twenty most common violation descriptions in the sample.
top_20 <-
  tickets_1pct %>%
  count(violation_description, sort = TRUE) %>%
  slice(1:20)

# Horizontal bar chart, bars ordered by frequency.
ggplot(top_20, aes(x = reorder(violation_description, n), y = n)) +
  geom_col() +
  coord_flip()
# How many sampled tickets are missing their issuing unit.
# (is.na() already returns a logical; comparing it to TRUE was redundant.)
tickets_1pct %>%
  filter(is.na(unit)) %>%
  nrow()
## [1] 29
# Number of distinct unit codes appearing in the sample (NA included).
tickets_1pct %>%
  distinct(unit) %>%
  nrow()
29 tickets have missing units - negligible.
Read in unit_key.csv. How many units are there?
# The raw key file repeats two column names, so explicit col_names are
# supplied (readr deduplicates the repeats, hence the warning below).
# NOTE(review): tail(-3) drops the first three rows -- presumably
# leftover header/junk lines; confirm against the raw file.
unit_key <-
read_csv("/project2/ppha30531/data/ticket_data/unit_key.csv",
col_names = c("reporting_district","department_name",
"department_description","department_category",
"dummy","reporting_district","department_category")) %>%
tail(-3) %>% #get rid of redundant rows
select(reporting_district, department_name, department_description)
## Warning: Duplicated column names deduplicated: 'reporting_district'
## => 'reporting_district_1' [6], 'department_category' =>
## 'department_category_1' [7]
## Parsed with column specification:
## cols(
## reporting_district = col_character(),
## department_name = col_character(),
## department_description = col_character(),
## department_category = col_character(),
## dummy = col_character(),
## reporting_district_1 = col_character(),
## department_category_1 = col_character()
## )
let unit be each distinct ticket-issuing authority. then the number of units is:
# Number of distinct ticket-issuing authorities, by description.
unit_key %>%
  distinct(department_description) %>%
  nrow()
## [1] 186
if by unit we mean each unique unit in the key:
# Number of unique rows in the key itself.
nrow(distinct(unit_key))
## [1] 385
# Attach each ticket's issuing-unit description. unit is numeric in the
# tickets and character in the key, so convert it to character up front.
# The original converted to factor, which full_join immediately coerced
# back to character (with a warning); its `X = NULL` removed a column
# this in-memory tibble does not have (X only appears after re-reading a
# write.csv file with row names).
tickets_joined <-
  tickets_1pct %>%
  mutate(unit = as.character(unit)) %>%
  full_join(unit_key, by = c("unit" = "reporting_district"))
## Warning: Column `unit`/`reporting_district` joining factor and character
## vector, coercing into character vector
ticket rows with matches in key:
# Ticket rows that found a match in the key: both sides of the join are
# populated. filter() takes plain logicals; `== FALSE` was noise.
tickets_joined %>%
  filter(!is.na(department_description), !is.na(ticket_number)) %>%
  nrow()
## [1] 287458
which is every row of the original tickets sample, so every ticket row found a match in unit_key.
# Rows created by full_join for key entries with no matching ticket.
tickets_joined %>%
  filter(is.na(ticket_number)) %>%
  nrow()
## [1] 256
256 rows have NA for their ticket number, meaning they were created when the full_join could not find a matching ticket for a unit_key entry — i.e., those unit keys are never used by any ticket in the sample.
Over the whole period in the dataset
# Ticket counts for the police department (CPD) vs the Department of
# Finance (DOF).
tickets_joined %>%
  filter(department_name %in% c("CPD", "DOF")) %>%
  count(department_name)
## # A tibble: 2 x 2
## department_name n
## <chr> <int>
## 1 CPD 120716
## 2 DOF 143909
Department of Finance issues more tickets.
# Top five CPD units by number of tickets issued.
cpd_tickets <-
  tickets_joined %>%
  filter(department_name == "CPD")

cpd_tickets %>%
  group_by(unit, department_description) %>%
  tally() %>%
  arrange(desc(n)) %>%
  head(5)
## # A tibble: 5 x 3
## # Groups: unit [5]
## unit department_description n
## <chr> <chr> <int>
## 1 18 1160 N. Larrabee 9478
## 2 24 6464 N. Clark 7946
## 3 252 OEMC 7374
## 4 10 3315 W. Ogden 5469
## 5 25 5555 W. Grand 5464
OEMC is the Office of Emergency Management; the rest appear to be police district stations.
Using the ACS 5-year estimates, since that is the most recent dataset I can find that breaks down by ZIP code. 1.
# ACS 2016 5-year income table (S1903), one row per ZIP-code tabulation
# area; "-" marks suppressed values and is read as NA.
census_data_zip <- read_csv(file = "/project2/ppha30531/caiy/ACS_16_5YR_S1903_with_ann.csv", na = "-")
## Parsed with column specification:
## cols(
## .default = col_character()
## )
## See spec(...) for full column specifications.
# Keep the geography label and three ACS estimates, then derive clean
# numeric columns.
# NOTE(review): variable-code meanings inferred from how they are used
# below -- HC01_EST_VC02 = total households, HC02_EST_VC02 = median
# household income, HC01_EST_VC05 = percent Black households; confirm
# against the table's metadata/annotation file.
census_data_zip <-
census_data_zip %>%
select("GEO.display-label", HC01_EST_VC02, HC02_EST_VC02,HC01_EST_VC05) %>%
mutate(
# strip the "ZCTA5" prefix to leave the bare 5-digit ZIP code
zipcode = as.character(str_trim(str_replace(`GEO.display-label`,"ZCTA5",""))),
share_black = as.numeric(HC01_EST_VC05)/100,
households = as.numeric(HC01_EST_VC02))
## Warning in evalq(as.numeric(HC01_EST_VC05)/100, <environment>): NAs
## introduced by coercion
## Warning in evalq(as.numeric(HC01_EST_VC02), <environment>): NAs introduced
## by coercion
cleaning tickets:
#https://stat.ethz.ch/R-manual/R-devel/library/base/html/substr.html
# Normalise ticket zipcodes to their first five characters so they
# match the census ZIP format.
tickets_joined <-
  tickets_joined %>%
  mutate(zipcode = str_trim(str_sub(zipcode, 1, 5)))
now we merge
# Merge census measures onto tickets by 5-digit ZIP code.
tickets_joined <- left_join(tickets_joined,census_data_zip, by = "zipcode")
# Tickets per household ("rate") by ZIP, alongside the census measures.
# add_tally() attaches n = tickets-per-zipcode to every row; households,
# share_black and income are constant within a ZIP, so mean() simply
# collapses each group to one row. The original re-applied
# group_by(zipcode) right before summarize(), but the data was already
# grouped by zipcode -- the redundant call is dropped.
unpaid_rank <-
  tickets_joined %>%
  group_by(zipcode) %>%
  add_tally() %>%
  mutate(rate = n / households) %>%
  select(rate, zipcode, share_black, HC02_EST_VC02) %>%
  rename(median_household_income = HC02_EST_VC02) %>%
  summarize(
    rate = mean(rate, na.rm = TRUE),
    share_black = mean(share_black, na.rm = TRUE),
    median_household_income = mean(as.numeric(median_household_income), na.rm = TRUE)
  ) %>%
  arrange(desc(rate))

# Rows 2-6: skip the top row (presumably an invalid/NA zipcode --
# confirm) and show the next five highest ticket rates.
unpaid_rank %>% head(6) %>% tail(5)
## # A tibble: 5 x 4
## zipcode rate share_black median_household_income
## <chr> <dbl> <dbl> <dbl>
## 1 60651 0.248 0.603 34397
## 2 60636 0.243 0.926 27475
## 3 60624 0.235 0.94 22160
## 4 60639 0.234 0.174 39913
## 5 60623 0.224 0.365 30048
which correspond roughly to (respectively) West Humboldt Park and Austin, West Englewood, West Garfield Park, Hanson Park, and Lawndale.
#http://alanzablocki.com/projects/chicago-crime-r/
# NOTE(review): data(zipcode) and clean.zipcodes() come from the
# `zipcode` package, which is not loaded in the library block at the
# top of this file -- confirm library(zipcode) runs before this chunk.
data(zipcode)
# Standardise ZIP formatting so the join keys line up.
unpaid_rank$zipcode <- clean.zipcodes(unpaid_rank$zipcode)
# Attach latitude/longitude for each ZIP code.
unpaid_zip <- left_join(unpaid_rank, zipcode, by = c("zipcode"="zip"))
# Plot ticket rate by ZIP on a Chicago basemap; alpha encodes the rate.
# FIX: fixed appearance settings (size, colour) are moved outside
# aes() -- mapping constants inside aes() creates meaningless legend
# entries instead of actually setting the size and colour.
# NOTE(review): ggmap() and the `chicago` basemap object are defined
# outside this excerpt -- confirm library(ggmap) and the basemap fetch
# run before this chunk.
ggmap(chicago) +
  geom_point(data = unpaid_zip,
             mapping = aes(x = longitude, y = latitude, alpha = rate),
             size = 0.6,
             colour = "red")
# Static screenshot of the rendered map, for the knitted document.
knitr::include_graphics("/project2/ppha30531/caiy/chicago.PNG", dpi = NA)